# coding=utf-8

from lxml import etree
import urllib2
from bs4 import BeautifulSoup

# Fetch the front page of the target CSDN blog.
url = 'http://blog.csdn.net/yuetiantian/'
req = urllib2.Request(url)
# Send a browser-like User-Agent — presumably the server rejects the
# default urllib2 agent string (TODO confirm against the site).
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0')
response = urllib2.urlopen(req)
try:
    the_page = response.read()
finally:
    # fix: the response handle was never closed (socket leak)
    response.close()

print the_page
#用beautifulsoup提取页面内容
soup = BeautifulSoup(the_page)
#print soup.title.string #页面title
print soup.find_all("li")

xpathStr = '/html/body/div/div[3]/div[2]/div[1]/div[4]/ul[2]/li/a' #获得页面左边的分类列表
root = etree.HTML(the_page)
links = root.xpath(xpathStr)
for link in links:
    print link
    print link.attrib['href']